# Fix the RNG seed so MCMC initialization and any simulation is reproducible.
set.seed(12345)
# Load and preprocess the clone-tracking data set (defines `data` and helpers).
source("ingest_data.R")
# Cache key for this model; brm() files below are stored under this name.
MODEL_CACHE <- "added-M_pop_A_team_team_repo_RDC_zi_sd025_gamma05"
Models are stored in the following directory, which must exist prior to knitting this document:
# Print the resolved model cache directory; mustWork = TRUE makes this fail
# early (before any long model fit) if the directory does not exist.
# Fixed: use TRUE rather than the reassignable alias T, and file.path()
# rather than paste(..., sep = "/") for path construction.
cat(normalizePath(file.path(getwd(), dirname(cachefile(MODEL_CACHE))), mustWork = TRUE))
## /home/app/.cache
The used cache directory can be controlled via the cache parameter to Rmd - it can be useful to experiment with this parameter if you Knit the document manually in RStudio.
Compared to the previous model, we now add two more predictors, existing complexity and existing number of clones in the file, prior to the change.
Both of these additional predictors have the same structure as the previous added and removed lines - they are both zero-bounded, and highly right-skewed. Therefore, we log-transform them, and center the resulting logarithm, so the value used in the model is zero-centered with standard deviation 1.
This means that we can use the same priors as we did for the prior model, \(\mathcal{M}_1\). The only difference is that now our model incorporates many more predictors.
# Assemble the modelling frame: short aliases for the outcome (introduced
# clones) and the centered/log-scaled predictors, plus grouping factors.
d <- select(
  data,
  y = INTROD,
  A = A,
  C = C,
  D = D,
  R = R,
  team = committerteam,
  repo = repo
)
# Two-part distributional formula: the count (negative-binomial mean) part and
# the zero-inflation (zi) part share the same population-level effects and the
# same varying intercepts/slopes per team and per team:repo combination.
formula <- bf(y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo),
zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo))
# Inspect the default priors brms would assign for this formula/family,
# before replacing them with our own weakly-informative priors below.
get_prior(formula = formula,
          data = d,
          family = zero_inflated_negbinomial)
## prior class coef group resp dpar nlpar lb ub
## (flat) b
## (flat) b A
## (flat) b C
## (flat) b D
## (flat) b R
## lkj(1) cor
## lkj(1) cor team
## lkj(1) cor team:repo
## student_t(3, -2.3, 2.5) Intercept
## student_t(3, 0, 2.5) sd 0
## student_t(3, 0, 2.5) sd team 0
## student_t(3, 0, 2.5) sd C team 0
## student_t(3, 0, 2.5) sd D team 0
## student_t(3, 0, 2.5) sd Intercept team 0
## student_t(3, 0, 2.5) sd R team 0
## student_t(3, 0, 2.5) sd team:repo 0
## student_t(3, 0, 2.5) sd C team:repo 0
## student_t(3, 0, 2.5) sd D team:repo 0
## student_t(3, 0, 2.5) sd Intercept team:repo 0
## student_t(3, 0, 2.5) sd R team:repo 0
## gamma(0.01, 0.01) shape 0
## (flat) b zi
## (flat) b A zi
## (flat) b C zi
## (flat) b D zi
## (flat) b R zi
## logistic(0, 1) Intercept zi
## student_t(3, 0, 2.5) sd zi 0
## student_t(3, 0, 2.5) sd team zi 0
## student_t(3, 0, 2.5) sd C team zi 0
## student_t(3, 0, 2.5) sd D team zi 0
## student_t(3, 0, 2.5) sd Intercept team zi 0
## student_t(3, 0, 2.5) sd R team zi 0
## student_t(3, 0, 2.5) sd team:repo zi 0
## student_t(3, 0, 2.5) sd C team:repo zi 0
## student_t(3, 0, 2.5) sd D team:repo zi 0
## student_t(3, 0, 2.5) sd Intercept team:repo zi 0
## student_t(3, 0, 2.5) sd R team:repo zi 0
## source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
# Weakly-informative priors, same structure as model M_1: tight normals on the
# (log-scale) intercept and slopes, weibull(2, 0.25) on all group-level sds
# (both for team and team:repo, in both the count and zi parts), a mildly
# regularizing lkj(2) on correlations, and gamma(0.5, 0.1) on the NB shape.
priors <- c(prior(normal(0, 0.5), class = Intercept),
prior(normal(0, 0.25), class = b),
prior(weibull(2, 0.25), class = sd),
prior(weibull(2, 0.25), class = sd, group=team:repo),
prior(lkj(2), class = cor),
prior(normal(0, 0.5), class = Intercept, dpar=zi),
prior(normal(0, 0.25), class = b, dpar=zi),
prior(weibull(2, 0.25), class = sd, dpar=zi),
prior(weibull(2, 0.25), class = sd, group=team:repo, dpar=zi),
prior(gamma(0.5, 0.1), class = shape))
# Check that every user prior matches a parameter of the model; the outer
# parentheses force the validated prior table to print.
(v <- validate_prior(
  formula = formula,
  prior = priors,
  data = d,
  family = zero_inflated_negbinomial
))
## prior class coef group resp dpar nlpar lb ub
## normal(0, 0.25) b
## normal(0, 0.25) b A
## normal(0, 0.25) b C
## normal(0, 0.25) b D
## normal(0, 0.25) b R
## normal(0, 0.25) b zi
## normal(0, 0.25) b A zi
## normal(0, 0.25) b C zi
## normal(0, 0.25) b D zi
## normal(0, 0.25) b R zi
## normal(0, 0.5) Intercept
## normal(0, 0.5) Intercept zi
## lkj_corr_cholesky(2) L
## lkj_corr_cholesky(2) L team
## lkj_corr_cholesky(2) L team:repo
## weibull(2, 0.25) sd 0
## weibull(2, 0.25) sd zi 0
## weibull(2, 0.25) sd team 0
## weibull(2, 0.25) sd C team 0
## weibull(2, 0.25) sd D team 0
## weibull(2, 0.25) sd Intercept team 0
## weibull(2, 0.25) sd R team 0
## weibull(2, 0.25) sd team zi 0
## weibull(2, 0.25) sd C team zi 0
## weibull(2, 0.25) sd D team zi 0
## weibull(2, 0.25) sd Intercept team zi 0
## weibull(2, 0.25) sd R team zi 0
## weibull(2, 0.25) sd team:repo 0
## weibull(2, 0.25) sd C team:repo 0
## weibull(2, 0.25) sd D team:repo 0
## weibull(2, 0.25) sd Intercept team:repo 0
## weibull(2, 0.25) sd R team:repo 0
## weibull(2, 0.25) sd team:repo zi 0
## weibull(2, 0.25) sd C team:repo zi 0
## weibull(2, 0.25) sd D team:repo zi 0
## weibull(2, 0.25) sd Intercept team:repo zi 0
## weibull(2, 0.25) sd R team:repo zi 0
## gamma(0.5, 0.1) shape 0
## source
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## user
## user
## (vectorized)
## (vectorized)
## user
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## user
The same structure of priors as in \(\mathcal{M}_1\) is used here—the only difference is that we have added more numerical predictors, scaled and centered like the existing ones.
# Prior-predictive model: sample_prior = "only" ignores the likelihood and
# draws from the priors alone, so the checks below probe prior plausibility.
# The fit is cached to disk ("ppc-" prefix) and refit only when inputs change.
M_ppc <- brm(data = d,
family = zero_inflated_negbinomial,
formula = formula,
prior = priors,
file = cachefile(paste0("ppc-",MODEL_CACHE)),
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
sample_prior = "only",
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
m <- M_ppc
# Draw prior-predictive replicates for the checks below.
yrep <- posterior_predict(m, newdata = d)
# Compare the prior-predicted proportion of zeros against the data.
ppc_stat(y = d$y, yrep, stat = \(y) mean(y == 0)) +
  ggtitle("Prior predicted proportion of zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The general pattern that we have seen in both \(\mathcal{M}_0\) and \(\mathcal{M}_1\) remains. Our model expects between \(40\%\) and \(100\%\) zeros, with the peak around \(75\%\), and lower probability at the extremes (especially below \(50\%\)).
# Prior-predicted maximum vs. the observed max; kept in sim_max for rescaling.
(sim_max <- ppc_stat(y = d$y, yrep, stat = "max") + ggtitle("Prior predicted max values")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Relative to the prior models, the maximum value is now more wild—ranging into the tens of thousands, which is clearly unrealistic.
If we scale the plot to more reasonable values, we see that there is higher probability of 500 to 1000 introduced clones than in \(\mathcal{M}_1\), and the most likely value is still below the observed maximum. So the observed value still fits well within the range of predictions.
# Zoom the x-axis to [0, 1000] to see the bulk of the max-value distribution.
sim_max + scale_x_continuous(limits = c(0,1000)) + ggtitle("Prior predicted max values up to 1000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Relative to \(\mathcal{M}_1\), our new model Q99 predictions range wider (up to 200, where \(\mathcal{M}_1\) had 100). But the general pattern fits the observations, and we know that, because we have more predictors, our model will swing more in its predictions.
# Prior-predicted 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Prior predicted Q99 vs. observed value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The same pattern repeats for the Q99-vs-Q95 plot. Somewhat wider predicted values, but still encompassing the observations.
# Joint Q95/Q99 scatter of prior predictions vs. the observed pair.
p <- ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + theme(legend.position="bottom") #+ ggtitle("Prior predicted Q95 vs Q99")
p
Instead of ranging up to 1200, the standard deviation now ranges to about 3000. We need to scale the \(x\)-axis to see the real pattern.
# Prior-predicted standard deviation; kept in p so we can rescale below.
(p <- ppc_stat(y = d$y, yrep, stat = "sd") + ggtitle("Prior predicted stddev vs. observed value")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Relative to \(\mathcal{M}_1\), this model has a wider standard deviation range, but it still encompasses the observed value well.
# Zoom to [0, 30] where the bulk of the predicted stddev mass lies.
p + scale_x_continuous(limits=c(0,30))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The group-level data is also wider than the prior model. The Brown team now ranges (based only on prior knowledge) up to 15000, and the Red team up to 4000.
# Prior-predicted Q99 broken down per team.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Also this model will show Integration tests to be a bit of an anomaly, with predictions (based only on prior knowledge) ranging up to 9000. Because we have not yet trained our model, these predictions indicate that something is different (e.g. more added or less removed lines, different complexity) in this repository, compared to the others.
# Prior-predicted Q99 broken down per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per repository")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Fit the full model on the data (no sample_prior = "only" this time).
# Cached to disk under MODEL_CACHE; refit only when formula/priors/data change.
M_pop_A_team_team_repo_zi_sd025_model <-
brm(data = d,
family = zero_inflated_negbinomial,
file = cachefile(MODEL_CACHE),
formula = formula,
prior = priors,
warmup = 1000,
iter = ITERATIONS,
chains = CHAINS,
cores = CORES,
backend="cmdstanr",
file_refit = "on_change",
threads = threading(THREADS),
save_pars = SAVE_PARS,
adapt_delta = ADAPT_DELTA)
# Attach the PSIS-LOO criterion for later model comparison.
M_pop_A_team_team_repo_zi_sd025_model <- add_criterion(M_pop_A_team_team_repo_zi_sd025_model, "loo")
# Short alias used throughout the diagnostics below.
m <- M_pop_A_team_team_repo_zi_sd025_model
# Trace plots for every parameter, paged 12 panels at a time so each page
# stays readable.
p <- mcmc_trace(m)
pars <- levels(p[["data"]][["parameter"]])
plots <- seq(1, to=length(pars), by=12)
lapply(plots, function(i) {
  # Clamp the page's upper index to the number of parameters, instead of
  # relying on out-of-bounds indexing yielding NAs that na.omit() then strips.
  end <- min(i + 11, length(pars))
  mcmc_trace(m, pars = pars[i:end])
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
The “caterpillar plots” show that the chains mixed well also for this model.
# Convergence diagnostics: per-parameter Rhat and its histogram.
mcmc_plot(m, type="rhat")
mcmc_plot(m, type="rhat_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Effective-sample-size ratios and their histogram.
mcmc_plot(m, type="neff")
mcmc_plot(m, type="neff_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Both MCMC chains, \(\hat{R}\) and \(N_{eff}\) ratio looks good.
# PSIS-LOO cross-validation estimate; printing shows elpd and Pareto-k table.
loo <- loo(m)
loo
##
## Computed from 12000 by 31007 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -8487.8 163.2
## p_loo 259.6 16.8
## looic 16975.5 326.4
## ------
## MCSE of elpd_loo is NA.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.2, 1.5]).
##
## Pareto k diagnostic values:
## Count Pct. Min. ESS
## (-Inf, 0.7] (good) 30995 100.0% 192
## (0.7, 1] (bad) 9 0.0% <NA>
## (1, Inf) (very bad) 3 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
# Pareto-k diagnostic plot: points above 0.7 need re-fitting via reloo.
plot(loo)
This, more complex, model also has more outliers that need to be analyzed with reloo. We now have 12 data points with PSIS-estimated Pareto-k value greater than the 0.7 threshold.
Reloo is disabled by default, but can be enabled by setting the RELOO environment variable (which in turn will set the reloo parameter of this document).
# Load cached reloo results if present; otherwise run reloo (which refits the
# model once per problematic observation and can take more than a day) and
# cache the result.
# NOTE(review): the result is assigned over the name of the reloo() function
# itself; kept for compatibility with the chunks below that use `reloo`.
reloofile <- cachefile(paste0("reloo-", MODEL_CACHE, ".rds"))
if (file.exists(reloofile)) {
  (reloo <- readRDS(reloofile))
} else {
  start_time <- Sys.time()
  reloo <- reloo(m, loo, chains=CHAINS, cores=CORES)
  # Bare Sys.time() inside braces is not auto-printed; print elapsed time
  # explicitly so the timing information actually appears in the output.
  print(Sys.time() - start_time)
  saveRDS(reloo, reloofile)
}
##
## Computed from 12000 by 31007 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -8492.8 163.8
## p_loo 264.6 18.7
## looic 16985.5 327.5
## ------
## MCSE of elpd_loo is 0.8.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.2, 1.5]).
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
Note that the reloo results are cached, and included in the docker image by default. If you want to reproduce the reloo manually, set a separate cache dir (and mount it into the docker image), and enable the RELOO environment variable. It will take more than one day to complete all reloo, so it is best done over a weekend.
# Plot the recalculated Pareto-k values after reloo.
plot(reloo)
# Which observations still have pareto_k above the pareto_k_ids() default
# threshold of 0.5?
influencers <- data[loo::pareto_k_ids(reloo),]
# Tally the influential observations by repo and team.
influencers |> group_by(repo,committerteam) |> tally()
## # A tibble: 0 × 3
## # Groups: repo [0]
## # ℹ 3 variables: repo <fct>, committerteam <fct>, n <int>
According to the reloo function, all Pareto-k values are below 0.7, so we should be able to trust the model inferences.
# Posterior-predictive replicates (fitted model, original data).
yrep <- posterior_predict(m)
# Posterior-predicted proportion of zeros vs. the observed proportion.
ppc_stat(y = d$y, yrep, stat = function(y) mean(y == 0))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The distribution of zeros is marginally higher than the observed value, but it still encompasses the observed proportion.
# Posterior-predicted maximum; kept in sim_max for the zoomed plot below.
(sim_max <- ppc_stat(y = d$y, yrep, stat = "max")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Our more complex model is similar to model \(\mathcal{M}_1\), but expects a slightly wider range of values, up to 7500 duplicates (\(\mathcal{M}_1\) had up to 6000). But the most likely value is still close to the observed max value (150).
# Zoom the max-value distribution to [0, 1000].
sim_max + scale_x_continuous(limits = c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
When scaling to the range 0–1000, we see that the most likely prediction of our model is actually somewhat less than 150 (the observed max value). But then the predictions drop off towards 750, or even 1000.
# Posterior-predicted standard deviation vs. the observed value.
ppc_stat(y = d$y, yrep, stat = "sd")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Same principle as for the other metrics—the standard deviation is slightly wider than for model \(\mathcal{M}_1\).
# Posterior-predicted 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Posterior predicted Q99")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Joint Q95/Q99 posterior-predictive scatter vs. the observed pair.
ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + ggtitle("Posterior predicted Q95 vs Q99")
The 95th and 99th quantiles are spot on the observed values of 1 and 5, respectively. Even smaller variations than \(\mathcal{M}_1\) are expected for Q99 (the histograms for 4 and 6 are smaller, and more evenly balanced in this model).
# Posterior-predicted maximum per team.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$team) + ggtitle("Posterior predictive max observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The expected max value per team varies a lot between teams. For some teams (e.g. Arch, Red, Violet), the model is reasonably sure that it will fall between 150-400, while for other teams (Brown, Green, Orange), it would expect up to 8000.
# Posterior-predicted maximum per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$repo) + ggtitle("Posterior predictive max observation per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Some repositories (rather, teams in some repositories, as our model does not consider repo-level variation separately), like Integration tests and Mars, expect up to 7500-8000 introduced clones. The others expect around 500–600, so an order of magnitude less.
# Posterior-predicted Q99 per team. Title fixed: q99 is the 99th percentile,
# not a quartile.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + ggtitle("Posterior predictive 99th percentile per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The 99th percentile value predictions seem very well fitted. Predictions surround the observation nicely, and the UI team continue to be the most varying (given that they have not contributed much in the data set).
# Posterior-predicted Q99 per repository. Title fixed: q99 is the 99th
# percentile, not a quartile.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + ggtitle("Posterior predictive 99th percentile per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Similar for the repository-level variation–the predictions surround the observed Q99 value nicely.
# Suspended rootogram: bars hanging from zero show over-/under-prediction.
(rootogram <- pp_check(m, type = "rootogram", style="suspended")
)
## Using all posterior draws for ppc type 'rootogram' by default.
We need to rescale the rootogram axes to see anything of value.
# Coordinates for annotating the inset/zoom region.
# Fixed: use <- for top-level assignment rather than =.
# NOTE(review): topleft is not referenced in the visible code below — confirm
# whether it is used elsewhere or can be removed.
topleft <- data.frame(x=c(0, 0),
xend=c(17, 17),
y=c(35, -4),
yend=c(22, 17))
# Main (zoomed) rootogram panel, limited to counts 0-50.
main.plot <- rootogram + scale_x_continuous(limits = c(0,50), breaks = seq(from=0, to=50, by=5)) + scale_y_continuous(limits = c(-4,35), breaks = c(-5, 0, 5, 10, 15, 20, 25, 30, 35)) + geom_text(x=1, y=35, label="b)") + theme(legend.position="bottom")
# Full-range inset with a red rectangle marking the zoomed region.
inset.plot <- rootogram + theme(legend.position = "none") + scale_x_continuous(limits = c(0,150)) + geom_text(x=20, y=150, label="a)") + geom_rect(xmin=-3, xmax=50, ymin=-4, ymax=35, color="red", fill=NA)
# Compose the main panel with the inset overlaid (cowplot).
p <- ggdraw() + draw_plot(main.plot) + draw_plot(inset.plot, x=0.25, y=.5, width = 0.5, height = 0.5)
## Warning: Removed 6611 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 6610 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Warning: Removed 6511 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 6509 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
p
We see that the suspended histogram bars “line up” around the \(x\)-axis, on both sides (like residuals usually do). This indicates that there is no systematic over- or under-prediction going on in our model, so we should be able to trust its predictions.
The posterior predictions in general work well. Though the single outlier in Saturn is not picked up (as most other changes there yield very few duplicates). Though our priors do allow the outliers to shine through, the model relegates them as very unlikely (most max predictions are in the 10s or 20s).
# Helpers for posterior predictions on a team x repo grid.
source("predict_utils.R")
# Probability of introducing any duplicate, at Q95 added/removed lines and
# mean complexity/duplicates.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates = mean(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Same probability at Q99 complexity/duplicates (a more extreme file).
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = q99(data$COMPLEX), duplicates = q99(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Same probability at median complexity/duplicates (a typical file).
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = median(data$COMPLEX), duplicates = median(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Large change (Q99 added/removed) into a simple file (Q25 complexity/dup).
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity = quantile(data$COMPLEX, 0.25), duplicates = quantile(data$DUP, 0.25), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Probability with complexity/duplicates left at their defaults.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p
# Q95 and Q99 of the predicted number of introduced clones.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q95(x)), "Quantile 95%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q99(x)), "Quantile 99%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity=q95(data$COMPLEX), duplicates = q95(data$DUP), summary=function(x) q99(x)), "Quantile 99%", decimals=0)
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity=q99(data$COMPLEX), duplicates = q99(data$DUP), summary=function(x) quantile(x, 0.75)), "Quantile 75%", decimals=0)
# Median prediction for a large addition with few removals, at mean file state.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=quantile(data$ADD, 0.99), removed=quantile(data$DEL, 0.1), complexity=mean(data$COMPLEX), duplicates = mean(data$DUP), summary=function(x) median(x)), "Median a posteriori", decimals=0)
# Full parameter summary: group-level sds/correlations, population effects,
# and the negative-binomial shape.
summary(m)
## Family: zero_inflated_negbinomial
## Links: mu = log; shape = identity; zi = logit
## Formula: y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
## zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
## Data: d (Number of observations: 31007)
## Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
## total post-warmup draws = 12000
##
## Group-Level Effects:
## ~team (Number of levels: 11)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept) 0.25 0.11 0.06 0.49 1.00 3408
## sd(R) 0.17 0.08 0.04 0.36 1.00 5339
## sd(C) 0.21 0.09 0.05 0.41 1.00 5345
## sd(D) 0.11 0.06 0.02 0.27 1.00 6498
## sd(zi_Intercept) 0.15 0.08 0.03 0.33 1.00 7160
## sd(zi_R) 0.26 0.11 0.06 0.50 1.00 4323
## sd(zi_C) 0.17 0.09 0.03 0.38 1.00 5740
## sd(zi_D) 0.15 0.09 0.03 0.35 1.00 6821
## cor(Intercept,R) 0.14 0.36 -0.58 0.77 1.00 8427
## cor(Intercept,C) 0.06 0.36 -0.65 0.73 1.00 7264
## cor(R,C) 0.11 0.36 -0.61 0.74 1.00 7527
## cor(Intercept,D) 0.14 0.38 -0.61 0.78 1.00 10810
## cor(R,D) -0.02 0.38 -0.72 0.70 1.00 10547
## cor(C,D) 0.01 0.38 -0.71 0.71 1.00 9248
## cor(zi_Intercept,zi_R) 0.00 0.38 -0.71 0.71 1.00 4565
## cor(zi_Intercept,zi_C) -0.11 0.38 -0.77 0.65 1.00 9848
## cor(zi_R,zi_C) 0.09 0.38 -0.65 0.76 1.00 9599
## cor(zi_Intercept,zi_D) 0.00 0.38 -0.70 0.72 1.00 9116
## cor(zi_R,zi_D) -0.14 0.38 -0.79 0.63 1.00 9162
## cor(zi_C,zi_D) -0.09 0.38 -0.76 0.66 1.00 8454
## Tail_ESS
## sd(Intercept) 4272
## sd(R) 6096
## sd(C) 5512
## sd(D) 6839
## sd(zi_Intercept) 6955
## sd(zi_R) 5679
## sd(zi_C) 6057
## sd(zi_D) 6914
## cor(Intercept,R) 9542
## cor(Intercept,C) 7527
## cor(R,C) 8817
## cor(Intercept,D) 9075
## cor(R,D) 9656
## cor(C,D) 9466
## cor(zi_Intercept,zi_R) 7256
## cor(zi_Intercept,zi_C) 8522
## cor(zi_R,zi_C) 9515
## cor(zi_Intercept,zi_D) 9114
## cor(zi_R,zi_D) 9053
## cor(zi_C,zi_D) 9630
##
## ~team:repo (Number of levels: 84)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept) 0.47 0.07 0.33 0.62 1.00 4365
## sd(R) 0.32 0.06 0.21 0.45 1.00 3309
## sd(C) 0.34 0.09 0.18 0.52 1.00 2242
## sd(D) 0.21 0.05 0.12 0.33 1.00 4147
## sd(zi_Intercept) 0.59 0.08 0.44 0.75 1.00 5222
## sd(zi_R) 0.35 0.08 0.21 0.52 1.00 3673
## sd(zi_C) 0.40 0.11 0.19 0.64 1.00 2234
## sd(zi_D) 0.36 0.08 0.21 0.54 1.00 4242
## cor(Intercept,R) -0.22 0.21 -0.62 0.20 1.00 2999
## cor(Intercept,C) -0.15 0.23 -0.56 0.33 1.00 4142
## cor(R,C) 0.25 0.24 -0.25 0.70 1.00 3103
## cor(Intercept,D) 0.01 0.25 -0.49 0.49 1.00 5110
## cor(R,D) -0.31 0.26 -0.76 0.25 1.00 5429
## cor(C,D) -0.28 0.29 -0.79 0.32 1.00 3359
## cor(zi_Intercept,zi_R) -0.35 0.21 -0.73 0.07 1.00 4222
## cor(zi_Intercept,zi_C) -0.47 0.20 -0.82 -0.03 1.00 4486
## cor(zi_R,zi_C) 0.18 0.29 -0.38 0.73 1.00 2621
## cor(zi_Intercept,zi_D) 0.07 0.22 -0.37 0.49 1.00 5593
## cor(zi_R,zi_D) 0.15 0.26 -0.36 0.64 1.00 4259
## cor(zi_C,zi_D) 0.02 0.30 -0.54 0.60 1.00 2180
## Tail_ESS
## sd(Intercept) 5681
## sd(R) 6079
## sd(C) 4929
## sd(D) 7345
## sd(zi_Intercept) 7108
## sd(zi_R) 6488
## sd(zi_C) 3582
## sd(zi_D) 6576
## cor(Intercept,R) 5823
## cor(Intercept,C) 6628
## cor(R,C) 4437
## cor(Intercept,D) 7430
## cor(R,D) 7367
## cor(C,D) 5783
## cor(zi_Intercept,zi_R) 5954
## cor(zi_Intercept,zi_C) 7074
## cor(zi_R,zi_C) 4575
## cor(zi_Intercept,zi_D) 8124
## cor(zi_R,zi_D) 6949
## cor(zi_C,zi_D) 5469
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -1.51 0.15 -1.80 -1.21 1.00 4476 6927
## zi_Intercept 1.90 0.16 1.58 2.20 1.00 4608 5601
## A 1.07 0.05 0.98 1.16 1.00 6999 8776
## R 0.09 0.09 -0.09 0.28 1.00 5405 7117
## C 0.30 0.12 0.05 0.53 1.00 4636 7122
## D 0.29 0.08 0.12 0.45 1.00 7159 8364
## zi_A -1.16 0.06 -1.27 -1.05 1.00 8230 8802
## zi_R 0.43 0.12 0.16 0.65 1.00 5160 7483
## zi_C 0.33 0.13 0.06 0.59 1.00 4471 6632
## zi_D -0.34 0.10 -0.54 -0.13 1.00 6227 8337
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 1.04 0.09 0.88 1.22 1.00 8533 8853
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Helpers for conditional-effect plots of complexity vs. duplicates.
source("conditional_effects.R")
# Conditional effect for the Blue team in Jupiter at Q95 added/removed lines.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter")
# Red/Jupiter panel, stripped of titles for composition into a figure below.
(red_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Jupiter") + labs(title=NULL, subtitle=NULL, legend=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Red/Uranus panel, same axes as Red/Jupiter so the two can share a scale.
(red_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + ylab(NULL) + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Arrange the two Red panels side by side with a shared legend.
(red_plot <- ggarrange(red_jupiter_plot, red_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
figsave("red_jupiter_uranus.pdf", red_plot)
# Green/Jupiter conditional effect at Q99 added lines.
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Green", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Green", "Jupiter")
# Blue/Jupiter panel for the composed figure below.
(blue_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + theme(legend.position = "none") + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Blue/Uranus panel, same y-scale as Blue/Jupiter.
(blue_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + ylab(NULL) + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Arrange the two Blue panels side by side with a shared legend.
(blue_plot <- ggarrange(blue_jupiter_plot, blue_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Save the Blue figure, then compose all four panels into one 2x2 figure.
figsave("blue_jupiter_uranus.pdf", blue_plot)
(complexity_plot <- ggarrange(plotlist = list(blue_jupiter_plot, blue_uranus_plot, red_jupiter_plot, red_uranus_plot), labels="AUTO", nrow=2, ncol=2, common.legend = T, legend="bottom")
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Save the composed figure, then inspect Red/Uranus at Q99 added lines.
figsave("blue_red_complexity.pdf", complexity_plot)
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
(red_venus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Venus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Venus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Neptune", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Neptune") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Saturn", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Saturn") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "IntTest", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "IntTest") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mars", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mars") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mercury", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mercury") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# Distribution of introduced-clone counts for Blue in Uranus, most common first.
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(desc(n))
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 968
## 2 1 44
## 3 2 23
## 4 3 5
## 5 4 4
## 6 5 2
## 7 17 1
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 13 × 2
## INTROD n
## <int> <int>
## 1 0 2080
## 2 1 35
## 3 2 18
## 4 3 2
## 5 4 9
## 6 5 4
## 7 6 3
## 8 7 8
## 9 8 2
## 10 9 2
## 11 19 1
## 12 22 1
## 13 23 1
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 6 × 2
## INTROD n
## <int> <int>
## 1 0 214
## 2 1 5
## 3 2 7
## 4 5 2
## 5 6 1
## 6 11 2
data |> filter(committerteam=="Red", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 5 × 2
## INTROD n
## <int> <int>
## 1 0 348
## 2 1 19
## 3 2 11
## 4 3 7
## 5 4 1
data |> filter(committerteam=="Red", repo=="Venus") |> tally()
## n
## 1 231
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 32 × 2
## `trunc(COMPLEX/5)` n
## <dbl> <int>
## 1 0 40
## 2 1 18
## 3 2 19
## 4 3 26
## 5 4 19
## 6 5 16
## 7 6 8
## 8 7 2
## 9 8 5
## 10 9 7
## # ℹ 22 more rows
data |> filter(committerteam=="Red", repo=="Jupiter") |> tally()
## n
## 1 2166
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 89 × 2
## `trunc(COMPLEX/5)` n
## <dbl> <int>
## 1 0 413
## 2 1 301
## 3 2 223
## 4 3 179
## 5 4 131
## 6 5 138
## 7 6 77
## 8 7 67
## 9 8 62
## 10 9 33
## # ℹ 79 more rows
data |> filter(committerteam=="Red") |> group_by(repo) |> tally() |> arrange(desc(n))
## # A tibble: 8 × 2
## repo n
## <fct> <int>
## 1 Jupiter 2166
## 2 Saturn 1191
## 3 IntTest 617
## 4 Uranus 386
## 5 Venus 231
## 6 Neptune 215
## 7 Mercury 211
## 8 Mars 130
data |> filter(committerteam=="Blue", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 654
## 2 1 8
## 3 2 4
## 4 3 1
## 5 4 1
## 6 5 1
## 7 8 1
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
## INTROD n
## <int> <int>
## 1 0 968
## 2 1 44
## 3 2 23
## 4 3 5
## 5 4 4
## 6 5 2
## 7 17 1
data |> filter(COMPLEX==max(COMPLEX))
## repo commit file ISTEST ISNEW ISREMOVED
## 1 Jupiter 52efadf06bb61941887114298aa281c277e1f8a0 910 FALSE FALSE FALSE
## 2 Jupiter 6de4ea0ed3207a898be965f3901efc5f0339bc0e 910 FALSE FALSE FALSE
## author authorteam
## 1 8ff0b36b3305550dc55c9097f3b970c599959326 Yellow
## 2 230d4e806215564296e126c9b532f1936adaaa89 Red
## committer committerteam churn addedComplexity
## 1 0e33fff6868e21d94d344c4bf2fcd81285b2c3e2 Arch 3 0
## 2 230d4e806215564296e126c9b532f1936adaaa89 Red 113 10
## removedComplexity cADD cDEL cSIZE cINTROD cREMOVED ADD DEL REASON CLOC
## 1 0 582 879 1461 0 0 3 3 I 9443
## 2 0 135 34 169 0 0 113 32 F 9443
## COMPLEX DUP logcADD logcDEL logcSIZE logcREMOVED logADD logDEL
## 1 1244 25 6.368187 6.779922 7.287561 0 1.386294 1.386294
## 2 1244 25 4.912655 3.555348 5.135798 0 4.736198 3.496508
## logCOMPLEX logDUP INTROD REMOVED A R C scaleDUP
## 1 7.126891 3.258097 0 0 -0.4751701 -0.1458825 2.537155 0.5413155
## 2 7.126891 3.258097 0 0 1.6152839 1.2642792 2.537155 0.5413155
## D cA cR cSZ cREM cREMlog
## 1 2.058662 0.67129576 1.044691 0.6849683 -0.2146616 -0.455959
## 2 2.058662 0.03418981 -0.252292 -0.3619011 -0.2146616 -0.455959
# Conditional effects of added lines, by complexity, in the Jupiter repo
# for the Blue and Arch teams. teamBlue/teamArch are reused below.
teamBlue <- condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter")
teamArch <- condeffect_logADD_by_logCOMPLEX(m, d, "Arch", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamBlue)
# NOTE(review): forcats is loaded mid-document, presumably for factor
# handling in the plotting code below -- consider moving this library()
# call to the document setup chunk.
library(forcats)
# Plot the observed outcomes (point size = PSIS pareto-k diagnostic) on top of
# the predicted conditional effect of added lines, coloured by truncated file
# complexity, for one team in one repo.
#
# reloo    - loo/reloo object; only reloo$diagnostics (incl. pareto_k) is used.
#            Rows must align with someData (they are bind_cols()'d together).
# someData - observed data with columns y, A, C, team, repo.
# ftot     - conditional-effect draws with columns A, C, Estimate, Q5.5, Q94.5.
# aTeam, aRepo - team/repo panel to display.
#
# Returns a ggplot object.
reloo_plot_logADD_by_logCOMPLEX <- function(reloo, someData, ftot, aTeam, aRepo) {
  obs <- bind_cols(someData, reloo$diagnostics) |>
    mutate(truncC = factor(trunc(unscale_complexity(round(C)))),
           added = unscale_added(A))
  # Re-level numerically: factor() orders the labels lexically by default.
  sorted_labels <- paste(sort(as.integer(levels(obs$truncC))))
  obs$truncC <- factor(obs$truncC, levels = sorted_labels)
  observed <- obs |> filter(team == aTeam, repo == aRepo)
  # FIX: the previous levels = trunc(unscale_complexity(C)) passed a vector
  # with repeated values, which makes factor() fail with
  # "factor level [6] is duplicated" (see the note following this function).
  # De-duplicate and sort the bucket values instead.
  pred_buckets <- trunc(unscale_complexity(as.integer(ftot$C)))
  predicted <- ftot |>
    mutate(added = unscale_added(A),
           complexity = factor(pred_buckets, levels = sort(unique(pred_buckets))))
  return(predicted |> ggplot(aes(x=added)) +
    # Ribbons of the predicted effect, one per complexity bucket.
    geom_smooth(aes(y=Estimate, ymin=Q5.5, ymax=Q94.5, group=complexity, color=complexity), stat="identity", alpha=.25, linewidth=.5) +
    # Observed points; larger points flag high pareto-k (influential) rows.
    geom_point(data=observed, aes(y=y, size = pareto_k, color=truncC), alpha=0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo))
  )
}
# Debugging of the "factor level [6] is duplicated" error raised by the
# (commented-out) call below: the conditional-effect grid maps onto only a
# few distinct complexity buckets, so using the raw bucket vector as factor
# levels repeats values.
# ! factor level [6] is duplicated
#reloo_plot_logADD_by_logCOMPLEX(reloo, d, teamBlue, "Blue", "Jupiter")
# The teamBlue grid collapses to just two complexity buckets (14 and 489).
teamBlue |> mutate(added=unscale_added(A), complexity=factor(trunc(unscale_complexity(as.integer(C))))) |> select(complexity) |> summary()
## complexity
## 14 :6000
## 489:4000
# Spot-check: the same bucketing applied to a random sample of the model data.
d |> sample_n(10) |> select(y, A, C, team, repo) |> mutate(foo = trunc(unscale_complexity(round(C))))
## y A C team repo foo
## 1 2 1.46776930 -0.2997277 Orange Jupiter 14
## 2 0 1.34561803 2.0975739 Red IntTest 489
## 3 0 0.89597273 1.5101936 Violet IntTest 489
## 4 0 -0.04262272 1.2796068 Brown Saturn 85
## 5 0 -0.33592097 0.3931244 Yellow Venus 14
## 6 0 -0.65469353 0.3931244 Blue Mars 14
## 7 0 1.07549612 -0.5330614 Blue Saturn 2
## 8 0 0.42775652 -0.6379823 Brown Saturn 2
## 9 0 -0.04262272 0.9503251 Blue Uranus 85
## 10 1 1.00685182 0.3931244 Blue Jupiter 14
#mutate(truncC=factor(trunc(unscale_complexity(round(C, 0)))), added=unscale_added(A), complexity=factor(trunc(unscale_complexity(round(C, 0))))) |> select(complexity)
# More conditional effects of added lines by complexity for selected teams.
plot_logADD_by_logCOMPLEX(m, d, teamArch)
teamBrown <- condeffect_logADD_by_logCOMPLEX(m, d, "Brown", "IntTest")
plot_logADD_by_logCOMPLEX(m, d, teamBrown)
teamUI <- condeffect_logADD_by_logCOMPLEX(m, d, "UI", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamUI)
# Posterior distributions of the population-level (fixed) effects; the
# mcmc_areas() version is the one exported to PDF.
(p <- mcmc_areas_ridges(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))
(p <- mcmc_areas(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))
figsave("pop_level_betas.pdf", p)
# Group-level standard deviations, team-level intercepts, and the Red team's
# varying effects at both the team and team:repo levels.
mcmc_areas(m, regex_pars = c("^sd_"))
mcmc_areas(m, regex_pars = c("^r_team[[].*,Intercept"))
mcmc_areas(m, regex_pars = c("^r_team.*[[]Red,.*[]]"))
mcmc_areas(m, regex_pars = c("^r_team:repo.*[[].*Red_IntTest,.*[]]"))
# Conditional effects of existing complexity by added lines, log1p x-axis.
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "IntTest", robust=T)) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Neptune")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Uranus")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "Jupiter")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))
# Conditional effects of added lines by complexity, with removed lines and
# existing duplicates held at fixed values (200/20 or data quantiles).
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Neptune", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
# Exported comparison plots: Green vs Blue vs Red in Jupiter at Q95 settings.
# NOTE(review): the Green panel uses ylim 0-215 while Blue/Red use 0-250 --
# presumably intentional for the paper layout; confirm.
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,215))
)
figsave("condeffect_added_vs_complex_green_jupiter.pdf", p)
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)
figsave("condeffect_added_vs_complex_red_jupiter.pdf", p)
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))+ scale_y_continuous(limits=c(0,175))
# Conditional effects of added lines by existing duplication.
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Jupiter", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
# Jupiter-specific quantiles, so the Jupiter panels use repo-local settings.
jupiter_stats <- data |> filter(repo == "Jupiter") |> summarize(remQ95 = q95(DEL), compQ95 = q95(COMPLEX), remQ99 = q99(DEL), compQ99 = q99(COMPLEX))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Jupiter", removed=jupiter_stats$remQ95, complexity = jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "IntTest", removed=jupiter_stats$remQ95, complexity=jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + scale_y_continuous(limits = c(0,400))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "IntTest", removed=q95(data$DEL), complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "NewTeam", "Saturn")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Saturn", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
# NOTE(review): removed=q95(data$DUP) on the next line looks like a
# copy-paste slip -- every comparable call uses q95(data$DEL) for removed.
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Saturn", removed=q95(data$DUP), duplicates = q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Orange", "Venus", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Yellow", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))
# Prediction helpers (predict_for_team, onepred, plot_cumulative_prob_of_duplicates, ...).
source("predict_utils.R")
Predicted outcomes for a Q95-sized change (added and removed lines at their 95th percentiles) in a highly complex, highly duplicated file.
# Cumulative probability of introducing at most x duplicates, per team and
# repo, at the Q95 predictor setting.
# NOTE(review): this call spells the argument `complex=` while later calls use
# `complexity=` -- presumably both resolve via partial argument matching in
# predict_for_team(); verify and unify the spelling.
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Jupiter", "IntTest", "Aniara", "Venus"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.25,1.0)) + scale_x_continuous(limits = c(0,30)) )
## Warning: Removed 4713 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Box plot of the predicted probability of introducing at least one duplicate
# (column pred0, given in percent) per repository, for a single team.
#
# predictions - prediction table with columns team, repo, pred0 and the
#               predictor settings added/removed/complexity/duplicates
#               (e.g. the output of onepred(), see its use below).
# aTeam       - the team to plot.
#
# Returns a ggplot object; the subtitle documents the predictor setting.
duplicates_probability_for_team <- function(predictions, aTeam) {
  params <- predictions |> select(added, removed, complexity, duplicates) |> distinct()
  # All rows must share one predictor setting, otherwise the subtitle would lie.
  stopifnot(length(params$added) == 1)
  predictions |> filter(team == aTeam) |> ggplot(aes(x=repo, y=pred0/100, color=team)) + geom_boxplot() + scale_color_manual(values=COLOR_BY_TEAM) + theme_bw() +
    ggtitle(paste("Probability of any duplicate per repository for team", aTeam),
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates, 0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ylab("P(INTROD > 0)") + scale_y_continuous(limits=c(0,1))
}
# Box plot of the predicted probability of introducing more than five
# duplicates (column pred5, given in percent) per repository, for a single
# team. Mirrors duplicates_probability_for_team() but for P(INTROD > 5).
#
# predictions - prediction table with columns team, repo, pred5 and the
#               predictor settings added/removed/complexity/duplicates.
# aTeam       - the team to plot.
#
# Returns a ggplot object; the subtitle documents the predictor setting.
more_five_probability_for_team <- function(predictions, aTeam) {
  params <- predictions |> select(added, removed, complexity, duplicates) |> distinct()
  # All rows must share one predictor setting, otherwise the subtitle would lie.
  stopifnot(length(params$added) == 1)
  predictions |> filter(team == aTeam) |> ggplot(aes(x=repo, y=pred5/100, color=team)) + geom_boxplot() + scale_color_manual(values=COLOR_BY_TEAM) + theme_bw() +
    ggtitle(paste("Probability of more than five duplicates per repository for team", aTeam),
            paste("added", round(params$added, 0), "removed", round(params$removed, 0), "complexity", round(params$complexity, 0), "duplicates", round(params$duplicates,0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ylab("P(INTROD > 5)") + scale_y_continuous(limits=c(0,1))
}
# One prediction at the all-Q95 setting, then the two probability box plots
# defined above for the Blue team.
pred <- onepred(added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))
(p <- duplicates_probability_for_team(pred, "Blue"))
(p <- more_five_probability_for_team(pred, "Blue"))
Observed numbers of introduced duplicates in the Mars repo, for comparison with the model's predictions.
# Observed duplicates vs existing duplication and complexity.
# NOTE(review): MARS and GREEN are unquoted here, unlike the string literals
# ("Mars", "Green") used elsewhere -- presumably constants defined in a
# sourced script; verify they exist.
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> ggplot(aes(x=DUP, y=COMPLEX, size=INTROD, color=INTROD)) + geom_point() + geom_jitter() + scale_y_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100, 250, 500, 750), trans="log1p") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_color_distiller(palette = "Spectral") + theme_bw()
)
# Worst observed case per team in the Mars repo.
data |> filter(repo == MARS) |> group_by(committerteam) |> filter(INTROD == max(INTROD)) |> select(committerteam, INTROD, DUP) |> arrange(desc(INTROD))
## # A tibble: 28 × 3
## # Groups: committerteam [10]
## committerteam INTROD DUP
## <fct> <int> <int>
## 1 Brown 24 86
## 2 Blue 14 72
## 3 Green 10 0
## 4 Violet 5 0
## 5 Yellow 2 1
## 6 Yellow 2 1
## 7 Red 2 0
## 8 Yellow 2 0
## 9 Orange 1 1
## 10 Orange 1 1
## # ℹ 18 more rows
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> group_by(INTROD) |> ggplot(aes(x=DUP, group=INTROD, fill=INTROD)) + geom_bar(position="stack") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_fill_distiller(palette = "Spectral") + theme_bw()
)
# Cumulative duplicate-count probabilities for further team/repo selections.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest", "Mars"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.50,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 3226 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# All repos, including "Aniara" (which does not occur in the training data).
allrepos <- c("Aniara", "Jupiter", "IntTest", "Mercury", "Venus", "Mars", "Saturn", "Neptune", "Uranus")
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Brown"), repo=allrepos, added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.20,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 10238 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Median existing duplication/complexity instead of Q95.
(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=median(data$DUP), complex=median(data$COMPLEX))) + scale_y_continuous(limits = c(0.70,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 41 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=20, complex=170)) + scale_y_continuous(limits = c(0.35,1)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 1225 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Reference: the Q95/Q99 predictor quantiles, overall and per repo.
data |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## q95(ADD) q95(DEL) q95(COMPLEX) q95(DUP)
## 1 143 92 282 36
data |> group_by(repo) |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 8 × 5
## repo `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 IntTest 216 127. 558 110
## 2 Jupiter 120 74.4 177 19
## 3 Mars 146. 114 154. 5
## 4 Mercury 144 106 282 6
## 5 Neptune 231 156 180 9
## 6 Saturn 101. 72.6 320 9
## 7 Uranus 158. 95.9 89.9 4
## 8 Venus 154 84.5 132 4
data |> group_by(repo) |> summarize(q99(ADD), q99(DEL), q99(COMPLEX), q99(DUP))
## # A tibble: 8 × 5
## repo `q99(ADD)` `q99(DEL)` `q99(COMPLEX)` `q99(DUP)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 IntTest 638. 511. 633 620.
## 2 Jupiter 298. 246. 1056 32
## 3 Mars 316. 557. 661 72
## 4 Mercury 304. 283. 374. 16.9
## 5 Neptune 511 474 216 28
## 6 Saturn 239 209. 758 18
## 7 Uranus 291. 273. 126. 9
## 8 Venus 433 374. 260 11
It is strange that the teams in the Neptune repo seem very similar for this change. A likely explanation: the Neptune repo is mostly static, with no team owning it — mostly abandoned code.
# Final round of cumulative-probability comparisons at various predictor
# settings; complexity/duplicates default to whatever predict_for_team()
# uses when they are omitted.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "Venus"), added=q95(data$ADD), removed=q95(data$DEL))) + scale_y_continuous(limits = c(0.6,1)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 359 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Mean-sized change instead of Q95.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "IntTest"), added=mean(data$ADD), removed=mean(data$DEL))) + scale_y_continuous(limits = c(0.2,1.02)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Q95 churn in an average file vs. in a Q95 file.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates=mean(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 328 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.3,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2114 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2403 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# All eight named teams side by side, Q95 setting and then Q99 churn with
# median existing complexity/duplication.
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 6318 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).
plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q99(data$ADD), removed=q99(data$DEL), complexity = median(data$COMPLEX), duplicates=median(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2651 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).